In [45]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn import cross_validation
from sklearn.cross_validation import cross_val_score
In [46]:
df = pd.read_csv("data/Pay_Scale_data.csv", low_memory=False)
In [47]:
df.head()
Out[47]:
rank
name
Anual%_ROI_without_FA
Anual%_ROI_with_FA
20yrnet_ROI_without_FA
20yrnet_ROI_with_FA
Total_4yr_cost
grad_rate
years_to_grad
loan_amnt
%_GrantMoney_Received
0
-
University of Phoenix - Phoenix, AZ(Private)
NaN
NaN
NaN
NaN
NaN
16
6
33520
91
1
-
Cleary University(Private)
NaN
NaN
NaN
NaN
NaN
44
4
24360
70
2
-
ITT Technical Institute - San Diego, CA(Private)
NaN
NaN
NaN
NaN
NaN
27
6
34120
70
3
-
Wilmington University(Private)
NaN
NaN
NaN
NaN
NaN
39
4
12680
73
4
-
Thomas Edison State College(In-State)
NaN
NaN
NaN
NaN
NaN
0
NaN
0
0
In [48]:
df = df.dropna()
In [49]:
df.head()
Out[49]:
rank
name
Anual%_ROI_without_FA
Anual%_ROI_with_FA
20yrnet_ROI_without_FA
20yrnet_ROI_with_FA
Total_4yr_cost
grad_rate
years_to_grad
loan_amnt
%_GrantMoney_Received
5
1
Harvey Mudd College(Private)
8.7
12.6
985300
1104500
237700
91
4
21920
71
6
2
California Institute of Technology (Caltech)(P...
8.6
13.4
901400
1029700
221600
93
4
22160
54
7
3
Stevens Institute of Technology(Private)
8.1
11.5
841000
948300
232000
79
5
44000
97
8
4
Colorado School of Mines(In-State)
11.4
13.5
831000
866200
112000
70
5
30480
72
9
5
Babson College(Private)
8.0
12.8
812800
946500
230200
91
4
31880
45
In [50]:
df.describe()
Out[50]:
Total_4yr_cost
grad_rate
years_to_grad
loan_amnt
%_GrantMoney_Received
count
1217.000000
1217.000000
1217.000000
1217.000000
1217.000000
mean
130077.156943
56.564503
4.517666
27865.242399
77.032046
std
47645.786880
17.124524
0.516082
5603.649815
17.735255
min
46500.000000
11.000000
4.000000
9680.000000
25.000000
25%
89500.000000
43.000000
4.000000
23960.000000
63.000000
50%
121700.000000
55.000000
5.000000
27360.000000
78.000000
75%
163000.000000
69.000000
5.000000
31080.000000
94.000000
max
245000.000000
98.000000
6.000000
51600.000000
100.000000
In [51]:
train_X = df[['Total_4yr_cost', 'grad_rate', 'years_to_grad', 'loan_amnt', '%_GrantMoney_Received']]
In [52]:
train_Y = df['Anual%_ROI_without_FA']
In [53]:
model = RandomForestRegressor(random_state=0, n_estimators=100, max_depth=50)
In [43]:
abs(np.mean(cross_val_score(model, train_X, train_Y, cv=5, scoring='mean_squared_error')))**0.5
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-43-162832098a69> in <module>()
----> 1 abs(np.mean(cross_val_score(model, train_X, train_Y, cv=5, scoring='mean_squared_error')))**0.5
/Users/nandini/anaconda/lib/python3.3/site-packages/sklearn/cross_validation.py in cross_val_score(estimator, X, y, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch)
1441 train, test, verbose, None,
1442 fit_params)
-> 1443 for train, test in cv)
1444 return np.array(scores)[:, 0]
1445
/Users/nandini/anaconda/lib/python3.3/site-packages/sklearn/externals/joblib/parallel.py in __call__(self, iterable)
803 self._iterating = True
804
--> 805 while self.dispatch_one_batch(iterator):
806 pass
807
/Users/nandini/anaconda/lib/python3.3/site-packages/sklearn/externals/joblib/parallel.py in dispatch_one_batch(self, iterator)
661 return False
662 else:
--> 663 self._dispatch(tasks)
664 return True
665
/Users/nandini/anaconda/lib/python3.3/site-packages/sklearn/externals/joblib/parallel.py in _dispatch(self, batch)
569
570 if self._pool is None:
--> 571 job = ImmediateComputeBatch(batch)
572 self._jobs.append(job)
573 self.n_dispatched_batches += 1
/Users/nandini/anaconda/lib/python3.3/site-packages/sklearn/externals/joblib/parallel.py in __init__(self, batch)
182 # Don't delay the application, to avoid keeping the input
183 # arguments in memory
--> 184 self.results = batch()
185
186 def get(self):
/Users/nandini/anaconda/lib/python3.3/site-packages/sklearn/externals/joblib/parallel.py in __call__(self)
71
72 def __call__(self):
---> 73 return [func(*args, **kwargs) for func, args, kwargs in self.items]
74
75 def __len__(self):
/Users/nandini/anaconda/lib/python3.3/site-packages/sklearn/externals/joblib/parallel.py in <listcomp>(.0)
71
72 def __call__(self):
---> 73 return [func(*args, **kwargs) for func, args, kwargs in self.items]
74
75 def __len__(self):
/Users/nandini/anaconda/lib/python3.3/site-packages/sklearn/cross_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, error_score)
1539 estimator.fit(X_train, **fit_params)
1540 else:
-> 1541 estimator.fit(X_train, y_train, **fit_params)
1542
1543 except Exception as e:
/Users/nandini/anaconda/lib/python3.3/site-packages/sklearn/ensemble/forest.py in fit(self, X, y, sample_weight)
236
237 if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous:
--> 238 y = np.ascontiguousarray(y, dtype=DOUBLE)
239
240 if expanded_class_weight is not None:
/Users/nandini/anaconda/lib/python3.3/site-packages/numpy/core/numeric.py in ascontiguousarray(a, dtype)
548
549 """
--> 550 return array(a, dtype, copy=False, order='C', ndmin=1)
551
552 def asfortranarray(a, dtype=None):
ValueError: could not convert string to float: '<-15.0'
In [ ]:
Content source: juanshishido/college-scorecard
Similar notebooks: